For this project I’m using a fascinating dataset from data.world, with analysis and viz in R Studio.

Thanks to @wnedds and @quanticdata for posting this project.

The data sources are: Probability of Automation: http://www.oxfordmartin.ox.ac.uk/downloads/academic/The_Future_of_Employment.pdf - from 2013 Job Numbers: Bureau of Labor Statistics. Note: Jobs where data was not available or there were less than 10 employees were marked as zero. Salary data: https://www.bls.gov/oes/current/oes_nat.htm

First, let’s get the data:

library(dplyr) # I love the tidyverse, thank you Hadley Wickham and co.
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(viridis)
## Loading required package: viridisLite
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(data.world)
## Loading required package: dwapi
## 
## Attaching package: 'dwapi'
## The following object is masked from 'package:dplyr':
## 
##     sql
# you'll need your own API token to run this part. It's cool, they're free.
#data.world::set_config(saved_cfg)
# Build the SQL query: join the national salary table (national_dl) to the
# state automation table on occupation code, selecting the median annual
# salary, the occupation code/name, the automation probability, and the
# per-state job counts (51 columns: 50 states + DC).
sql_stmt <- qry_sql("SELECT --This selects the fields listed below for analysis.
       national_dl.a_median,
       raw_state_automation_data.soc,
       raw_state_automation_data.occupation,
       raw_state_automation_data.probability,
       raw_state_automation_data.alabama,
       raw_state_automation_data.alaska,
       raw_state_automation_data.arizona,
       raw_state_automation_data.arkansas,
       raw_state_automation_data.california,
       raw_state_automation_data.colorado,
       raw_state_automation_data.connecticut,
       raw_state_automation_data.delaware,
       raw_state_automation_data.district_of_columbia,
       raw_state_automation_data.florida,
       raw_state_automation_data.georgia,
       raw_state_automation_data.hawaii,
       raw_state_automation_data.idaho,
       raw_state_automation_data.illinois,
       raw_state_automation_data.indiana,
       raw_state_automation_data.iowa,
       raw_state_automation_data.kansas,
       raw_state_automation_data.kentucky,
       raw_state_automation_data.louisiana,
       raw_state_automation_data.maine,
       raw_state_automation_data.maryland,
       raw_state_automation_data.massachusetts,
       raw_state_automation_data.michigan,
       raw_state_automation_data.minnesota,
       raw_state_automation_data.mississippi,
       raw_state_automation_data.missouri,
       raw_state_automation_data.montana,
       raw_state_automation_data.nebraska,
       raw_state_automation_data.nevada,
       raw_state_automation_data.new_hampshire,
       raw_state_automation_data.new_jersey,
       raw_state_automation_data.new_mexico,
       raw_state_automation_data.new_york,
       raw_state_automation_data.north_carolina,
       raw_state_automation_data.north_dakota,
       raw_state_automation_data.ohio,
       raw_state_automation_data.oklahoma,
       raw_state_automation_data.oregon,
       raw_state_automation_data.pennsylvania,
       raw_state_automation_data.rhode_island,
       raw_state_automation_data.south_carolina,
       raw_state_automation_data.south_dakota,
       raw_state_automation_data.tennessee,
       raw_state_automation_data.texas,
       raw_state_automation_data.utah,
       raw_state_automation_data.vermont,
       raw_state_automation_data.virginia,
       raw_state_automation_data.washington,
       raw_state_automation_data.west_virginia,
       raw_state_automation_data.wisconsin,
       raw_state_automation_data.wyoming
       FROM raw_state_automation_data JOIN national_dl ON national_dl.occ_code = raw_state_automation_data.soc")
# Authenticate with data.world (uses the API token configured above), run
# the query against the project dataset, and peek at the first rows.
dwapi::configure()
df <- data.world::query(sql_stmt, "quanticdata/occupation-and-salary-by-state-and-likelihood-of-automation")
head(df)
## # A tibble: 6 x 55
##   a_median soc   occupation probability alabama alaska arizona arkansas
##   <chr>    <chr> <chr>            <dbl>   <int>  <int>   <int>    <int>
## 1 59020    13-1~ Training ~       0.014    2670      0    7710     2740
## 2 51800    47-2~ Structura~       0.83     1340    180    1460      930
## 3 30570    47-3~ Helpers–B~       0.83      340      0     580      230
## 4 28810    47-3~ Helpers–C~       0.92      680    250     300      270
## 5 29530    47-3~ Helpers–E~       0.74     1950    220     560      300
## 6 27310    47-3~ Helpers–P~       0.94      220     60     300      280
## # ... with 47 more variables: california <int>, colorado <int>,
## #   connecticut <int>, delaware <int>, district_of_columbia <int>,
## #   florida <int>, georgia <int>, hawaii <int>, idaho <int>,
## #   illinois <int>, indiana <int>, iowa <int>, kansas <int>,
## #   kentucky <int>, louisiana <int>, maine <int>, maryland <int>,
## #   massachusetts <int>, michigan <int>, minnesota <int>,
## #   mississippi <int>, missouri <int>, montana <int>, nebraska <int>,
## #   nevada <int>, new_hampshire <int>, new_jersey <int>, new_mexico <int>,
## #   new_york <int>, north_carolina <int>, north_dakota <int>, ohio <int>,
## #   oklahoma <int>, oregon <int>, pennsylvania <int>, rhode_island <int>,
## #   south_carolina <int>, south_dakota <int>, tennessee <int>,
## #   texas <int>, utah <int>, vermont <int>, virginia <int>,
## #   washington <int>, west_virginia <int>, wisconsin <int>, wyoming <int>

Great, I have df containing median annual income (a_median), the occupation code (soc) used to join the tables, the occupation name, the probability of a particular occupation getting automated, and the estimated # of workers in occupation by state. It should be noted at the start that I’m using the median annual income here. Look at the source dataset, there’s more detailed quartile values available, which could be great for getting more accurate estimates.

Before we start doing calculations, it’s good to do some exploratory data analysis (EDA).

First, I want to calculate the total number of workers in a given occupation across the US:

# oops, before I do that, the a_median is missing some data and got read in
# as a character vector. I gotta fix that.
# Convert the tibble to a plain data.frame explicitly (later code sets
# rownames, which tibbles don't support; transform() did this implicitly),
# then coerce a_median to numeric. Non-numeric entries become NA (expect a
# coercion warning) and are replaced with 0 just below.
df <- as.data.frame(df)
df$a_median <- as.numeric(df$a_median)
## Warning in eval(substitute(list(...)), `_data`, parent.frame()): NAs
## introduced by coercion
# Missing medians were coerced to NA above; treat them as 0 for now.
df$a_median <- coalesce(df$a_median, 0)

# Total workers per occupation, summed over the 51 state columns (5:55).
df$total_US <- rowSums(df[, 5:55])

# Compact working table for the EDA below.
US_jobs <- df[, c("occupation", "probability", "total_US", "a_median")]

It might be informative to look at the histograms and ecdf curves and get a feel for the data.

# I want to plot several plots together, and I'll probably do it again later, so it's worth calling the multiplot function:
# Lay out multiple ggplot objects on one page.
# (Adapted from the Cookbook for R; `file` is accepted for backward
# compatibility but is not used.)
#
# ...      : ggplot objects
# plotlist : optional list of ggplot objects, appended to ...
# cols     : number of layout columns (used only when `layout` is NULL)
# layout   : optional matrix; plot i is drawn where the matrix contains i
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  library(grid)  # grid ships with base R; needed for viewports

  # Collect the ... arguments and plotlist into one list
  plots <- c(list(...), plotlist)
  numPlots <- length(plots)

  # If layout is NULL, derive a cols-wide grid from the number of plots
  if (is.null(layout)) {
    layout <- matrix(seq_len(cols * ceiling(numPlots / cols)),
                     ncol = cols, nrow = ceiling(numPlots / cols))
  }

  if (numPlots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Print each plot into its matching cell of the layout matrix
    for (i in seq_len(numPlots)) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

# Histogram of automation risk across occupations
hist(US_jobs$probability,
     main = 'Histogram of Risk of Automation',
     xlab = 'Risk of Automation')

# Plot the empirical CDF of x. Plotting the ecdf object itself draws the
# proper step function; the previous plot(x, Fn(x)) scattered unsorted
# points instead.
ecdf_plot <- function(x, title, xlab, ylab) {
  Fn <- ecdf(x)
  plot(Fn, xlab = xlab, ylab = ylab, main = title)
}
# call plot
ecdf_plot(US_jobs$probability,'Risk of Automation ECDF','Risk of Automation','ecdf')

The probability is skewed to the extremes, with very little in between. I don’t know how realistic that is, so it’ll be hard to draw concrete conclusions.

# Ten occupations with the highest automation risk, alongside their total
# US workforce and median annual income:
high_risk <- US_jobs %>%
  arrange(desc(probability)) %>%
  head(10)
high_risk
##                                                       occupation
## 1                                                Watch Repairers
## 2                                         Insurance Underwriters
## 3                                                   Sewers; Hand
## 4                                                  Tax Preparers
## 5  Photographic Process Workers and Processing Machine Operators
## 6                                       Mathematical Technicians
## 7                    Title Examiners; Abstractors; and Searchers
## 8                                            Library Technicians
## 9                                                  Telemarketers
## 10                                           New Accounts Clerks
##    probability total_US a_median
## 1         0.99     1090    36740
## 2         0.99    90310    67680
## 3         0.99     5170    24520
## 4         0.99    63600    36550
## 5         0.99    26200    26470
## 6         0.99      220    49660
## 7         0.99    51810    45800
## 8         0.99    93400    32890
## 9         0.99   214570    24300
## 10        0.99    41430    34990

Jobs with the highest probability of being automated are technical, but repetitive. I don’t see a clear trend with respect to job frequency or income.

# and the ten occupations least likely to be automated:
low_risk <- US_jobs %>%
  arrange(probability) %>%
  head(10)
low_risk
##                                                        occupation
## 1                                         Recreational Therapists
## 2  First-Line Supervisors of Mechanics; Installers; and Repairers
## 3                                  Emergency Management Directors
## 4                Mental Health and Substance Abuse Social Workers
## 5                                                    Audiologists
## 6                                       Healthcare Social Workers
## 7                                         Occupational Therapists
## 8                                     Orthotists and Prosthetists
## 9                                 Oral and Maxillofacial Surgeons
## 10 First-Line Supervisors of Fire Fighting and Prevention Workers
##    probability total_US a_median
## 1       0.0028    18080    46410
## 2       0.0030   453360    63540
## 3       0.0030     9540    70500
## 4       0.0031   111370    42700
## 5       0.0033    11830    75980
## 6       0.0035   159340    53760
## 7       0.0035   118060    81910
## 8       0.0035     6260    65630
## 9       0.0036     3450        0
## 10      0.0036    56980    74540

A lot of the jobs with lowest probability of being automated are technical and involve human interaction, like social workers and therapists. The salaries in this group tend to be better paid at >50k, while the high risk jobs are mostly <50k.

Let’s look at the job freq numbers:

# Job Frequency:
# histogram
hist(US_jobs$total_US, 
     main = 'Histogram of Job Frequency',
     xlab = 'Job Frequency')

# call ecdf plot (x-axis label fixed: this variable is job frequency, not income)
ecdf_plot(US_jobs$total_US,'Job Frequency ECDF','Job Frequency','ecdf')

The frequency of job types are strongly skewed to the right. There’s a few jobs with >2 million people, but the vast majority look to be around 100,000. Specialization in action.

# Ten most common occupations across the US:
common_jobs <- US_jobs %>%
  arrange(desc(total_US)) %>%
  head(10)
common_jobs
##                                                                         occupation
## 1                                                              Retail Salespersons
## 2                                                                         Cashiers
## 3               Combined Food Preparation and Serving Workers; Including Fast Food
## 4                                                           Office Clerks; General
## 5                                                 Customer Service Representatives
## 6                           Laborers and Freight; Stock; and Material Movers; Hand
## 7                                                           Waiters and Waitresses
## 8  Secretaries and Administrative Assistants; Except Legal; Medical; and Executive
## 9                                                  General and Operations Managers
## 10                   Janitors and Cleaners; Except Maids and Housekeeping Cleaners
##    probability total_US a_median
## 1         0.92  4528570    22680
## 2         0.97  3540980    20180
## 3         0.92  3426090    19440
## 4         0.96  2955560    30580
## 5         0.55  2707030    32300
## 6         0.85  2587940    25980
## 7         0.94  2564620    19990
## 8         0.96  2295480    34820
## 9         0.16  2188870    99310
## 10        0.66  2161710    24190

A lot of the most common jobs have a very high probability of being automated and low median income. A lot of service jobs.

# and the ten rarest occupations:
uncommon_jobs <- US_jobs %>%
  arrange(total_US) %>%
  head(10)
uncommon_jobs
##                                occupation probability total_US a_median
## 1            Computer Support Specialists      0.6500        0    52160
## 2                  Postsecondary Teachers      0.0320        0    66500
## 3                         Prosthodontists      0.0550        0   126050
## 4                 Physicians and Surgeons      0.0042        0        0
## 5      Miscellaneous Agricultural Workers      0.8700        0    22520
## 6                Cooks; Private Household      0.3000       30    32060
## 7     Fishers and Related Fishing Workers      0.8300       40    27110
## 8  Timing Device Assemblers and Adjusters      0.9800      130    37040
## 9                Mathematical Technicians      0.9900      220    49660
## 10                     Model Makers; Wood      0.9600      280    40890

It’s not clear to me why there are 0 workers for some of these jobs. For the physicians and surgeons, I think they are paid hourly and thus the BLS doesn’t have a median annual income. I don’t see a clear trend between job rarity and risk of automation and median salary.

Let’s look at median annual incomes:

# Distribution of median annual income: histogram, then ecdf.
income_lab <- 'Median Annual Income'
hist(US_jobs$a_median,
     main = paste('Histogram of', income_lab),
     xlab = income_lab)

# call ecdf plot
ecdf_plot(US_jobs$a_median, paste(income_lab, 'ECDF'), income_lab, 'ecdf')

The median incomes look right skewed, which sounds right. Most jobs pay less than $50k; few pay more than $150k. The ecdf is interesting, you can see the minimum wage effect as there’s no median annual income between $0-15k or so.

# Ten occupations with the highest median annual income:
high_income <- US_jobs %>%
  arrange(desc(a_median)) %>%
  head(10)
high_income
##                                        occupation probability total_US
## 1                                Chief Executives      0.0150   223270
## 2                               Dentists; General      0.0044   105650
## 3       Computer and Information Systems Managers      0.0350   352540
## 4          Architectural and Engineering Managers      0.0170   177540
## 5                              Marketing Managers      0.0140   205900
## 6                             Petroleum Engineers      0.1600    31510
## 7  Airline Pilots; Copilots; and Flight Engineers      0.1800    66850
## 8                                 Prosthodontists      0.0550        0
## 9      Judges; Magistrate Judges; and Magistrates      0.4000    24720
## 10                                    Podiatrists      0.0046     9320
##    a_median
## 1    181210
## 2    153900
## 3    135800
## 4    134730
## 5    131180
## 6    128230
## 7    127820
## 8    126050
## 9    125880
## 10   124830

CEOs, dentists, and STEM managers, sounds about right. Also, most of them have very low risk of being automated.

# and the ten with the lowest median annual income:
low_income <- US_jobs %>%
  arrange(a_median) %>%
  head(10)
low_income
##                                                            occupation
## 1                                                              Actors
## 2                                                             Dancers
## 3                                               Musicians and Singers
## 4                                     Oral and Maxillofacial Surgeons
## 5                                                       Orthodontists
## 6                                             Physicians and Surgeons
## 7                                                      Gaming Dealers
## 8  Combined Food Preparation and Serving Workers; Including Fast Food
## 9                                                          Shampooers
## 10                                                   Cooks; Fast Food
##    probability total_US a_median
## 1       0.3700    46770        0
## 2       0.1300     6840        0
## 3       0.0740    38860        0
## 4       0.0036     3450        0
## 5       0.0230     3800        0
## 6       0.0042        0        0
## 7       0.9600    83170    19290
## 8       0.9200  3426090    19440
## 9       0.7900    14390    19700
## 10      0.8100   513200    19860

I think artists don’t get a regular paycheck, but rather get paid per gig, which explains why the BLS doesn’t have an estimate for median annual income. Either that or the median is 0 and there’s a bunch of starving artists. Food prep workers are not highly paid, that sounds right. Also, check out the probability of automation for dealers, interesting.

Taken together, it would seem there might be a relationship between automation risk and income, and possibly # of workers. This can be evaluated visually with some simple plots. First, let’s look at probability and income.

# Scatter: automation probability vs. raw median annual income.
ggplot(US_jobs) +
  geom_point(aes(x = probability, y = a_median)) +
  theme_classic() +
  labs(title = "US Jobs - Automation Risk by Median Annual Income",
       x = "Probability Automation Risk",
       y = "Median Annual Income ($)")

We knew from before that the probability data is highly bimodal, and the income was right skewed. That being said, it looks like the values on the right are lower than those on the left. I’m going to try plotting the salary by percent rank and see if that gives some clarity:

# Percent-rank both variables in a single mutate for later scaling.
US_jobs <- US_jobs %>%
  mutate(p_job_freq = percent_rank(total_US),
         p_income = percent_rank(a_median))

# Same scatter, with income on a percentile scale.
ggplot(US_jobs) +
  geom_point(aes(x = probability, y = p_income)) +
  theme_classic() +
  labs(title = "US Jobs - Automation Risk by Median Annual Income",
       x = "Probability Automation Risk",
       y = "Percentile Median Annual Income")

Looking at income as percent ranked, it looks clear that a lot of those with higher salaries also have lower risk of probability. This is probably the high skill/ high pay crowd. I can’t help but feel that this chart indicates the divide between ‘haves’ and ‘have nots’ will likely widen as automation is adopted.

Those with lower risk of automation appear to have higher median annual incomes, a testable hypothesis. Instead of running a regression, it may be easier to interpret if I divide the samples into groups and compare. I know from the histogram that the data is bimodal, and I really want to compare the high risk and low risk groups, so I’m going to define 3 groups, and compare the high and low risk group respectively.

But first, Let’s look at the relationship between risk of automation and job frequency:

# Make scatter plot comparing probability of automation and job frequency
# (header comment fixed: this chart plots total_US, not income):
# using raw data
ggplot(US_jobs, aes(x=probability, y=total_US)) + 
  geom_point() +
  theme_classic() + 
  labs(title = "US Jobs - Automation Risk by Job Title Frequency", y = "Job Title Frequency", x = "Probability Automation Risk")

I think job frequency and risk of automation are independent variables. That being said, it does appear that several of the most frequent jobs (>2E6) have a high risk of automation, which could have significant economic displacement.

Relationship between income and job frequency:

# Scatter: job frequency vs. raw median annual income.
ggplot(US_jobs) +
  geom_point(aes(x = total_US, y = a_median)) +
  theme_classic() +
  labs(title = "US Jobs - Job Title Frequency by Median Annual Income",
       x = "Job Title Frequency",
       y = "Median Annual Income")

I already knew from the histograms that both of these features were right skewed, and that is obvious here.

Now I’m going to add an extra layer of info here by coloring the top and bottom tenths of risk of automation.

# engineer new features: bucket occupations by automation risk
# (low: p < 0.1, high: p > 0.9, mid otherwise; boundaries stay in 'mid')
US_jobs$risk_group <- 'mid'
US_jobs$risk_group[US_jobs$probability < 0.1] <- 'low'
US_jobs$risk_group[US_jobs$probability > 0.9] <- 'high'
US_jobs$risk_group <- as.factor(US_jobs$risk_group)

# Color points by risk group. The default point shape responds to the
# `colour` aesthetic, not `fill`, so map colour and use scale_color_viridis
# -- the original fill mapping had no visible effect on the points.
ggplot(US_jobs, aes(x = total_US, y = a_median, color = risk_group)) +
  geom_point() +
  scale_color_viridis(discrete = TRUE) +
  theme_classic() +
  labs(title = "US Jobs - Job Title Frequency by Median Annual Income",
       x = "Job Title Frequency", y = "Median Annual Income")

Hmm, the custom color isn’t working, but even so it looks like the low risk of automation jobs tend to have higher salaries. A boxplot would visualize this better.

# Boxplot of income by risk group; reorder() sorts the groups low -> mid
# -> high by their mean automation probability.
ggplot(US_jobs,
       aes(x = reorder(risk_group, probability),
           y = a_median,
           fill = risk_group)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE) +
  theme_classic() +
  labs(title = "US Jobs - Automation Risk Group by Median Annual Income",
       x = "Automation Risk Group",
       y = "Median Annual Income")

Although there are outliers, it appears that jobs with greater than 90% risk of automation (high) have lower median annual incomes than those with less than 10% risk of automation (low). Because the data isn’t normally distributed, we can test the hypothesis using the non-parametric Mann-Whitney U test.

# Compare only the extreme groups: drop 'mid', then run the
# non-parametric Mann-Whitney U (Wilcoxon rank-sum) test on income.
test <- US_jobs[US_jobs$risk_group != 'mid', ]
wilcox.test(test$a_median ~ test$risk_group)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  test$a_median by test$risk_group
## W = 3512.5, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

Boom, reject the null hypothesis! Those with jobs least affected by automation are earning more than those who will be displaced. It’s interesting, and indicates there are some objectively wise career choices one should consider. High pay and non-automatable, those are jobs to pursue at the present moment. I’ll figure out exactly what those are in a little bit.

But first, it’s not all doom and gloom for those getting displaced. It’s reasonable to assume that as automation takes over low-end jobs, new opportunities will emerge. However, there’s likely to be some ‘turbulence’ as displaced workers figure out the new economy.

I want to estimate the scale of economic disruption. How many US jobs are at risk, all told? And how much current income will be displaced by automation? Let’s find out:

# First, I'm going to calculate a few new columns to answer these questions:
# jobs_at_risk   = expected # of workers displaced (probability * workers)
# current_income = total annual income for the occupation (workers * median)
# income_at_risk = expected annual income displaced
# mutate() appends the columns in the order listed, same as before.
df <- df %>%
  mutate(jobs_at_risk = probability * total_US,
         current_income = total_US * a_median,
         income_at_risk = jobs_at_risk * a_median)

# save and print the sums
# (print(c(...)) coerces the number to character, hence the quoted output)
total_jobs = sum(df$total_US)
print(c('Current number of jobs:',total_jobs))
## [1] "Current number of jobs:" "125099130"
total_jobs_at_risk = sum(df$jobs_at_risk)
print(c('Estimated number of automatable jobs:',total_jobs_at_risk))
## [1] "Estimated number of automatable jobs:"
## [2] "75757157.488"
# share of all current jobs expected to be displaced
print(c('% Current jobs displaced:',total_jobs_at_risk*100/total_jobs))
## [1] "% Current jobs displaced:" "60.5577013109524"

Based on these estimates 75,757,158 US jobs can be automated (~60% of all current jobs). This seems really high.

Let’s look at how much money will be displaced:

# Next, calculate the total income and income at risk
# (annual dollars, summed over all occupations)
total_income = sum(df$current_income)
print(c('Current income:',total_income))
## [1] "Current income:" "5457014122300"
total_income_at_risk = sum(df$income_at_risk)
print(c('Estimated income displaced by automation:',total_income_at_risk))
## [1] "Estimated income displaced by automation:"
## [2] "2572185783759.71"
# share of cumulative current annual income expected to be displaced
print(c('% of current annual income displaced by automation:',total_income_at_risk*100/total_income))
## [1] "% of current annual income displaced by automation:"
## [2] "47.135406398318"

Based on these estimates roughly $2.6 trillion (~47% of cumulative current annual incomes) will be displaced. That’s almost half. That’s huge.

That’s going to have a big impact on taxes, as well as the economy at large. If people don’t have money to spend, how will there be demand? If people aren’t working, they can’t pay back taxes and debt. That’s going to cause a lot of turbulence.

Yeesh, let’s keep digging. I want to revisit the question about which jobs are going to continue to do well, as well as the jobs that are going to get hit hardest.

Because the data is broken by state and occupation, I’d like to look at: 1) which jobs are going be most disrupted, and which will remain stable? 2) which states are going to be affected most and least by automation? 3) which jobs in each state are going to get hardest hit?

  1. which jobs are going be most disrupted, and which will remain stable? There’s a couple ways to ask this. For now, I want to focus on number of jobs displaced.
  1. Across total US, which jobs will displace most workers?
  2. And how much money will that displace?
  3. which will be most least affected? What will be the highest paying among those?

First, I’ll make use of the jobs_at_risk variable I made up above.

# Rank occupations by expected jobs displaced and keep the top ten.
risky_jobs <- df %>%
  mutate(job_p = percent_rank(jobs_at_risk)) %>%
  arrange(desc(job_p)) %>%
  head(10)

# Top displaced jobs, plus the income each would take with it; shared
# styling: horizontal viridis bars, no legend.
jobs <- ggplot(risky_jobs,
               aes(x = reorder(occupation, jobs_at_risk),
                   y = jobs_at_risk, fill = jobs_at_risk)) +
  geom_col() +
  coord_flip() +
  scale_fill_viridis() +
  theme_classic() +
  theme(legend.position = "none") +
  labs(title = "Most Jobs Displaced", y = "Jobs at Risk", x = "Occupation")

income <- ggplot(risky_jobs,
                 aes(x = reorder(occupation, jobs_at_risk),
                     y = income_at_risk, fill = jobs_at_risk)) +
  geom_col() +
  coord_flip() +
  scale_fill_viridis() +
  theme_classic() +
  theme(legend.position = "none") +
  labs(title = "Associated Income", y = "Annual Income Displaced ($)", x = "Occupation")

multiplot(jobs, income)
## Loading required package: grid

# x axes should be improved (scale, add M or B respectively)

How does it change when we look for jobs that are going to displace the most income? And how many jobs will that displace?

# Rank occupations by expected annual income displaced; keep the top ten.
risky_income <- df %>%
  mutate(income_p = percent_rank(income_at_risk)) %>%
  arrange(desc(income_p)) %>%
  head(10)

# Top income displacement, plus the job counts behind it.
income <- ggplot(risky_income,
                 aes(x = reorder(occupation, income_at_risk),
                     y = income_at_risk, fill = income_at_risk)) +
  geom_col() +
  coord_flip() +
  scale_fill_viridis() +
  theme_classic() +
  theme(legend.position = "none") +
  labs(title = "Top Income Displacement", y = "Annual Income Displaced ($)", x = "Occupation")

jobs <- ggplot(risky_income,
               aes(x = reorder(occupation, income_at_risk),
                   y = jobs_at_risk, fill = income_at_risk)) +
  geom_col() +
  coord_flip() +
  scale_fill_viridis() +
  theme_classic() +
  theme(legend.position = "none") +
  labs(title = "Associated Jobs", y = "Jobs at Risk", x = "Occupation")

multiplot(income, jobs)

  1. Taken together, this suggests point-of-sales jobs like retail and cashiers will be most disrupted. This sounds reasonable, brick and mortar stores are dying and Amazon and Alibaba are thriving.

I think it’s interesting that when we look at it by income, some ‘white collar’ jobs show up, like Accountants and Auditors. That’s not surprising, software will make it easier to track process data.

Next, I’d like to look at which jobs are predicted to have low risk of automation and high pay:

# Low-automation (<10% risk), non-rare (>10k workers) occupations,
# ranked by median pay; keep the top ten.
stable_jobs <- df %>%
  filter(probability < 0.1 & total_US > 10000) %>%
  arrange(desc(a_median)) %>%
  head(10)

# Jobs at risk (small, by construction) and median income for the group.
jobs <- ggplot(stable_jobs,
               aes(x = reorder(occupation, a_median),
                   y = jobs_at_risk, fill = a_median)) +
  geom_col() +
  coord_flip() +
  scale_fill_viridis() +
  theme_classic() +
  theme(legend.position = "none") +
  labs(title = "Low Automation, High-Salary Jobs", y = "Jobs at Risk", x = "Occupation")

income <- ggplot(stable_jobs,
                 aes(x = reorder(occupation, a_median),
                     y = a_median, fill = a_median)) +
  geom_col() +
  coord_flip() +
  scale_fill_viridis() +
  theme_classic() +
  theme(legend.position = "none") +
  labs(title = "Median Annual Income", y = "Median Annual Income", x = "Occupation")

multiplot(jobs, income)

Nearly every entry on this list requires a 4-year degree, if not a graduate degree. TL;DR - Management or Medicine, if you’re planning on going to college then that’s what you want to aim for. That’s probably been true for at least 20 years, so not that surprising. I would have thought more tech jobs would be up there.

  1. which states are going to be affected most and least by automation?

I want to see how the displacement of jobs and income will be distributed across the US, by state:

# get jobs per state (the 51 state columns of the joined table)
st_jobs = df[,c(5:55)]
# calculate income per state
# NOTE: data.frame * vector recycles the vector down each column, so every
# state column is multiplied element-wise by the per-occupation vector --
# rows stay aligned by occupation.
st_income = st_jobs * df$a_median
# calculate jobs at risk per state (expected displaced workers)
st_jobs_at_risk = st_jobs * df$probability
# calculate income at risk per state (expected displaced income)
st_income_at_risk = st_jobs_at_risk * df$a_median

# Label a per-state table (rows = occupations, 51 state columns):
# - set occupation row names (relies on the global `df` being in scope and
#   row-aligned with x),
# - transpose so states become rows,
# - prepend a `code` column of two-letter state abbreviations.
# The query's column order is alphabetical by state name with DC right
# after Delaware, which is exactly base R's state.abb with "DC" inserted
# at position 9.
stateLabels <- function(x) {
  rownames(x) <- df$occupation
  # transpose for mapping and add state code
  x <- data.frame(t(x))
  state_codes <- append(state.abb, "DC", after = 8)
  cbind(code = state_codes, x)
}

# Attach occupation names + state codes to each per-state table.
labeled <- lapply(
  list(st_jobs, st_income, st_jobs_at_risk, st_income_at_risk),
  stateLabels
)
st_jobs           <- labeled[[1]]
st_income         <- labeled[[2]]
st_jobs_at_risk   <- labeled[[3]]
st_income_at_risk <- labeled[[4]]

# Per-state totals: rowSums over every column except `code` ([, -1]
# replaces the hard-coded 2:689, which breaks if columns shift).
st_jobs$total <- rowSums(st_jobs[, -1])
st_jobs_at_risk$total <- rowSums(st_jobs_at_risk[, -1])
st_income$total <- rowSums(st_income[, -1])
st_income_at_risk$total <- rowSums(st_income_at_risk[, -1])

# Scaled job risk (jobs at risk / total jobs per state, as a %): the % of
# each state's workforce at risk, by occupation (the `total` column,
# included here, gives the state-wide %).
st_jobs_risk_scaled <- cbind(code = st_jobs$code, 100 * st_jobs_at_risk[, -1] / st_jobs$total)
# Scaled income risk (income at risk / total jobs per state): median
# annual income per worker at risk -- the relative economic impact.
st_income_risk_scaled <- cbind(code = st_jobs$code, st_income_at_risk[, -1] / st_jobs$total)

# State-level summary. Build the data.frame directly with named columns:
# the original cbind() coerced every column to character AND supplied only
# four names for five columns, shifting every label by one.
total <- data.frame(
  code           = st_jobs$code,
  current_jobs   = st_jobs$total,
  jobs_at_risk   = st_jobs_at_risk$total,
  current_income = st_income$total,
  income_at_risk = st_income_at_risk$total
)
rownames(total) <- rownames(st_jobs)

# saving dfs for tableau work later
write.csv(st_jobs, "current state jobs.csv")
write.csv(st_income, "current state income.csv")
write.csv(st_jobs_at_risk, "state jobs at risk.csv")
write.csv(st_income_at_risk, "state income at risk.csv")
write.csv(st_jobs_risk_scaled, "state jobs at risk - scaled to current state jobs.csv")
write.csv(st_income_risk_scaled, "state income at risk - scaled to current state jobs.csv")
write.csv(total, "state totals.csv")

OK, now I’ve made my data, it’s time for EDA. I’m going to start with the jobs at risk, and I’m going to use the scaled data. But first, let’s get some summary stats.

# Summary stats on the % of each location's workforce displaced (the `total`
# column of the scaled table; rownames are full location names, e.g. "nevada").
# NOTE: the `##` lines are the knitted console output — leave them intact.
# NOTE(review): c() coerces the numbers to character here, hence the quoted
# output; reformatting (e.g. sprintf) would desync the captured output below.
# Now let's do some stats on the relative impact:
print(c('Median % of US Workforce Displacement:',median(st_jobs_risk_scaled$total)))
## [1] "Median % of US Workforce Displacement:"
## [2] "61.3095593010991"
print(c('Minimum % of Workforce Displaced:',min(st_jobs_risk_scaled$total)))
## [1] "Minimum % of Workforce Displaced:" "44.5093591967443"
# Look up the row(s) attaining the minimum; rownames carry the location name.
print(c('Place with lowest workforce displacement:',rownames(st_jobs_risk_scaled)[st_jobs_risk_scaled$total == min(st_jobs_risk_scaled$total)]))
## [1] "Place with lowest workforce displacement:"
## [2] "district_of_columbia"
print(c('Maximum % of US Workforce Displaced:',max(st_jobs_risk_scaled$total)))
## [1] "Maximum % of US Workforce Displaced:"
## [2] "66.4547518066372"
print(c('Place with highest workforce displacement:',rownames(st_jobs_risk_scaled)[st_jobs_risk_scaled$total == max(st_jobs_risk_scaled$total)]))
## [1] "Place with highest workforce displacement:"
## [2] "nevada"

61% sounds really high. I remember reading a McKinsey study which estimated around 47%.

Also, notice the place least affected is DC at 44%. What is this, the Hunger Games? Rationally speaking, government officials and lobbyists tend to perform highly specialized work which depends on personal connections. That being said, my first impression is that it seems ironic that the people with the most power already will be least affected.

NV is the hardest hit, with 66% of its workforce at risk. I would hazard a guess that a lot of the service industry will be automated, meaning a lot of casino, hotel, and restaurant workers will be displaced.

Let’s get an idea of how this is distributed. A simple bar plot might help get an idea of how the data looks.

# Bar chart of % workforce displaced, one bar per location, ordered by value.
# NOTE: x comes from the parallel st_jobs$code vector, which relies on the
# two tables sharing an identical row order.
ggplot(st_jobs_risk_scaled, aes(x = reorder(st_jobs$code, total), y = total, fill = total)) + 
  geom_col() +
  scale_fill_viridis() +
  theme_classic() + 
  theme(legend.position = "none") +
  labs(title = "Workforce Displacement by Location",
       y = "% Workforce Displaced",
       x = "Location")

It looks like NV and SD will have the largest % of their workforce displaced, and DC is a bit of an outlier.

I can get the geographic distribution by mapping it with plotly.

# Choropleth (plotly) of the % of workforce at risk, keyed by postal code.
# load plotly user name and apikey if saving to plotly.
# call choropleth, plot scaled % jobs at risk of automation
# give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# NOTE(review): `l` is defined but never passed to add_trace (it would need
# marker = list(line = l)), so the white borders are not actually drawn.
# specify some map projection/options
g <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white'))

# locations comes from the parallel st_jobs$code vector rather than a column
# of the plotted data frame — relies on identical row order between tables.
p <- plot_geo(st_jobs_risk_scaled, locationmode = 'USA-states') %>% 
  add_trace(
    z = ~total,
    locations = ~st_jobs$code,
    color = ~total,
    # reversed viridis "B" (inferno) palette, discretized to 20 colors
    colors = viridis_pal(alpha = 1, begin = 0, end = 1, direction = -1,
  option = "B")(20)
    ) %>%
  # NOTE(review): z is a percentage, so the "Jobs" colorbar title is
  # misleading — consider "% at Risk".
  colorbar(title = "Jobs") %>%
  layout(
    title = 'Estimated Jobs At Risk of Automation',
    geo = g
  )
p
# lame, it doesn't render in the notebook.
# render online (the `##` lines are captured api_create output):
chart_link = api_create(p, filename="choropleth-Percent-Jobs-Automation-US")
## Found a grid already named: 'choropleth-Percent-Jobs-Automation-US Grid'. Since fileopt='overwrite', I'll try to update it
## Found a plot already named: 'choropleth-Percent-Jobs-Automation-US'. Since fileopt='overwrite', I'll try to update it
chart_link
# see the plot here: https://plot.ly/~jimmyjamesarnold/7

Now let’s explore the scaled income data:

# Summary stats on annual income displaced per worker (the `total` column of
# the scaled income table; rownames are full location names).
# NOTE: the `##` lines are the knitted console output — leave them intact.
# Now let's do some stats on the relative impact:
print(c('Median Annual Income Displaced per Worker:',median(st_income_risk_scaled$total)))
## [1] "Median Annual Income Displaced per Worker:"
## [2] "20643.5998596482"
print(c('Minimum Annual Income Displaced per Worker:',min(st_income_risk_scaled$total)))
## [1] "Minimum Annual Income Displaced per Worker:"
## [2] "18245.4062832244"
# Look up the row(s) attaining the minimum; rownames carry the location name.
print(c('Place with lowest income displacement:',rownames(st_income_risk_scaled)[st_income_risk_scaled$total == min(st_income_risk_scaled$total)]))
## [1] "Place with lowest income displacement:"
## [2] "district_of_columbia"
print(c('Maximum Annual Income Displaced per Worker:',max(st_income_risk_scaled$total)))
## [1] "Maximum Annual Income Displaced per Worker:"
## [2] "22066.3967868333"
print(c('Place with highest income displacement:',rownames(st_income_risk_scaled)[st_income_risk_scaled$total == max(st_income_risk_scaled$total)]))
## [1] "Place with highest income displacement:"
## [2] "south_dakota"

Hey, those look familiar! SD has the 2nd most job displacement, just after NV. And DC had the lowest job displacement, so it would seem that DC is going to weather automation fairly well.

Let’s see the barplot:

# Bar chart of annual income displaced per worker, one bar per location,
# ordered by value. x uses the parallel st_jobs$code vector (same row order).
ggplot(st_income_risk_scaled, aes(x = reorder(st_jobs$code, total), y = total, fill = total)) + 
  geom_col() +
  scale_fill_viridis() +
  theme_classic() + 
  theme(legend.position = "none") +
  labs(title = "Annual Income Displacement per Worker",
       y = "Annual Income Displacement",
       x = "Location")

Interesting, it looks like DC, MA, MD, and CT are going to be the least affected, whereas ND, SD, and WY are going to be hit hardest.

Let’s get the map with plotly.

# Choropleth (plotly) of annual income displaced per worker, by state.
# give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# specify some map projection/options
g <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white'))

# locations comes from the parallel st_jobs$code vector — relies on identical
# row order between the scaled table and st_jobs.
p <- plot_geo(st_income_risk_scaled, locationmode = 'USA-states') %>% 
  add_trace(
    z = ~total,
    locations = ~st_jobs$code,
    color = ~total,
    # reversed viridis "B" (inferno) palette, discretized to 20 colors
    colors = viridis_pal(alpha = 1, begin = 0, end = 1, direction = -1,
  option = "B")(20),
    # FIX: `l` was defined but never used; pass it so the white state borders
    # described above actually render.
    marker = list(line = l)
    ) %>%
  # FIX: this map shows income displaced per worker, not job counts, so the
  # colorbar title "Jobs" was wrong.
  colorbar(title = "Income ($)") %>%
  layout(
    title = 'Estimated Income Displaced per Worker at Risk of Automation',
    geo = g
  )
p
# lame, it doesn't render in the notebook.

# render online:
#chart_link = api_create(p, filename="choropleth-Income-Displaced-per-Worker-Automation-US")
#chart_link
# see the plot here: https://plot.ly/~jimmyjamesarnold/9